/*******************************************************************************
* --------------------------------------------------------------------------- *
* File: * @(#) MSXMLParser.java * Author: * Ning Zhang
* nzhang@systemsbiology.org
* ****************************************************************************** * * *
* This software is provided ``AS IS'' and any express or implied * *
* warranties, including, but not limited to, the implied warranties of * *
* merchantability and fitness for a particular purpose, are disclaimed. * * In
* no event shall the authors or the Institute for Systems Biology * * liable
* for any direct, indirect, incidental, special, exemplary, or * *
* consequential damages (including, but not limited to, procurement of * *
* substitute goods or services; loss of use, data, or profits; or * * business
* interruption) however caused and on any theory of liability, * * whether in
* contract, strict liability, or tort (including negligence * * or otherwise)
* arising in any way out of the use of this software, even * * if advised of
* the possibility of such damage. * * *
* ******************************************************************************/
package org.systemsbiology.jrap.stax;
import javax.xml.stream.XMLStreamReader;
import java.io.*;
import java.util.*;
/**
* A generic utility class for reading an MSXML file in a random access fashion
* and utilizing a stored scan index for fast reads.
*
* tholzman 200911xx
* -Inserting changes for compatibility with sequential scan iterators
*
* To use this parser with sequential access instead of random access
* use this constructor:
*
* new MSXMLParser(path,true)
*
* If you use "false" as the second argument, you will construct a regular
* jrap parser, just as if you had used the "new MSXMLParser(path)" constructor.
* The original constructor still works.
*
* To get the next scanHeader use
*
* ScanHeader scan = parser.nextHeader();
*
* Note the absence of the scan index as a parameter. This routine returns
* the information from the next scan, sequentially in the data file.
*
* It would be a bad idea to intermix nextHeader with rapHeader calls. The
* idea of the sequential parser is to avoid slow random access seeks. In
* the sequential version the scan index is built up gradually as you "next"
* through the file. When nextHeader returns null, the parsing is complete,
* and a call to getOffsets() will return the completed map. Each returned
* ScanHeader object will contain the correct file offset. The map from
* getOffsets() will only contain the file offsets up to the current scan.
*
* One final note: XML errors raised while a sequential parser is being
* constructed are recast as IOExceptions. This keeps the parser compatible
* with previous code.
*
**/
public final class MSXMLParser
{
    /** The file we are in charge of reading */
    protected String fileName = null;

    /** The indexes: scan number mapped to the byte offset of that scan in the file. */
    protected Map<Integer, Long> offsets;

    /** Scan count read from the file (sequential mode) or max scan reported by the index (random mode). */
    protected int maxScan;

    /** Value reported by {@code IndexParser.getChrogramIndex()}; only set in random access mode. */
    protected long chrogramIndex;

    /** True when the file name looks like an mzXML file. */
    protected boolean isXML = false;

    /** True when the file name does not look like an mzXML file (treated as mzML). */
    protected boolean isML = false;

    /* TAH Nov 2009 */
    /** Number of scans handed out so far by {@link #nextHeader()}. */
    int currentScanIndex;

    /** Sequential iterator over the file; {@code null} when the parser is in random access mode. */
    EndPatternStringIterator epsi = null;

    public void setEpsi(EndPatternStringIterator e) {
        this.epsi = e;
    }

    public EndPatternStringIterator getEpsi() {
        return epsi;
    }

    /**
     * Tells whether a file name denotes an mzXML file.
     *
     * @param fn the file name to inspect
     * @return true if the name contains "mzxml", ignoring case
     */
    public static boolean isMzXML(String fn) {
        //dhmay 20100223, changing this check to a case-insensitive check
        // Locale.ROOT keeps the lower-casing independent of the platform default locale.
        return fn.toLowerCase(Locale.ROOT).indexOf("mzxml") != -1;
    }

    /** Records the file name and decides between the mzXML and mzML code paths. */
    private void commonInits(String fileName) {
        if (isMzXML(fileName)) {
            isXML = true;
        } else {
            isML = true;
        }
        this.fileName = fileName;
    }

    /**
     * Prepares the parser for sequential access: reads the declared scan count
     * from the run/spectrum-list element and then re-targets the iterator at
     * the individual scan elements.
     *
     * @throws IOException if the scan count element cannot be read or parsed
     */
    private void sequentialInits() throws IOException {
        String leftPat = "<msRun", rightPat = ">", attr = "scanCount";
        String nextLeftPat = "<scan", nextRightPat = "</peaks>";
        if (isML) {
            leftPat = "<spectrumList"; rightPat = ">"; attr = "count";
            nextLeftPat = "<spectrum"; nextRightPat = "</spectrum>";
        }
        setEpsi(new EndPatternStringIterator(leftPat, rightPat, fileName));
        try {
            XMLStreamReader xmlSR = epsi.xmlsrNext();
            xmlSR.next();
            maxScan = Integer.parseInt(xmlSR.getAttributeValue(null, attr));
        } catch (Exception e) {
            // A missing element or attribute used to escape as an unchecked
            // NullPointerException/NumberFormatException; report everything
            // as IOException, which callers of the constructor already handle.
            if (e instanceof IOException) {
                throw (IOException) e;
            }
            throw new IOException("Could not read attribute '" + attr + "' from " + fileName, e);
        }
        offsets = new HashMap<Integer, Long>();
        getEpsi().setLeftPatStr(nextLeftPat);
        getEpsi().setRightPatStr(nextRightPat);
        currentScanIndex = 0;
    }

    /** Prepares the parser for random access by loading the stored scan index. */
    private void randomInits() {
        //using IndexParser get indexes
        IndexParser indexParser = new IndexParser(fileName);
        indexParser.parseIndexes();
        offsets = indexParser.getOffsetMap();
        maxScan = indexParser.getMaxScan();
        chrogramIndex = indexParser.getChrogramIndex();
    }

    /**
     * Creates a parser in either sequential or random access mode.
     *
     * @param fn path of the file to read
     * @param isSequential true for sequential access via {@link #nextHeader()},
     *        false for index-based random access
     * @throws IOException if sequential initialisation fails
     */
    public MSXMLParser(String fn, boolean isSequential) throws IOException {
        commonInits(fn);
        if (isSequential) {
            sequentialInits();
        } else {
            randomInits();
        }
    }

    /**
     * Creates a random access parser.
     *
     * @param fileName path of the file to read
     */
    public MSXMLParser(String fileName) {
        commonInits(fileName);
        randomInits();
    }
    /* end TAH */

    /**this gives back the file header (info before scan)
     *@return the file header info (MZXMLFileInfo)
     */
    public MZXMLFileInfo rapFileHeader()
    {
        FileHeaderParser fileParser = new FileHeaderParser(fileName);
        fileParser.parseFileHeader();
        return fileParser.getInfo();
    }

    /**
     * Reads the header of one scan using the scan index.
     *
     * dhmay changing 20091021 to set the scanOffset on the returned scanHeader.  This was earlier behavior that
     * was removed by Ning.  Replacing, because it increases efficiency quite a bit for calling code.
     *
     * @param scanNumber 1-based scan number
     * @return a scan header object without peaks information, or null if the
     *         scan number has no known offset, the file cannot be opened and
     *         positioned, or no header could be parsed
     */
    public ScanHeader rapHeader(int scanNumber)
    {
        // Look the offset up before touching the file so nothing is opened
        // (and leaked) for unknown scan numbers.
        long scanOffset = getScanOffset(scanNumber);
        if (scanOffset == -1) {
            return null;
        }
        FileInputStream fileIN = openAtOffset(scanOffset);
        if (fileIN == null) {
            return null;
        }
        ScanHeader scanHeader = isXML
                ? parseXmlAndClose(fileIN, false).getHeader()
                : parseMlAndClose(fileIN, false).getHeader();
        if (scanHeader != null) {
            scanHeader.setScanOffset(scanOffset);
        }
        return scanHeader;
    }

    /* TAH Nov 2009 */
    /**
     * Returns the header of the next scan in file order and records its
     * offset in the map returned by {@link #getOffsets()}.
     *
     * @return the next scan header, or null when the iterator yields no
     *         further scan element
     * @throws IllegalStateException if the parser has no sequential iterator,
     *         or if no header could be parsed from the current scan element
     */
    public ScanHeader nextHeader()
    {
        if (epsi == null) {
            throw new IllegalStateException(
                    "nextHeader() requires a parser constructed in sequential mode");
        }
        StringBuilder curScanInfo = epsi.next();
        if (curScanInfo == null || curScanInfo.length() == 0 || curScanInfo.charAt(0) != '<') {
            return null;
        }
        currentScanIndex++;

        ScanHeader scanHeader;
        Exception failure = null;
        if (isXML) {
            ScanAndHeaderParser headerParser = new ScanAndHeaderParser();
            headerParser.setIsScan(false);
            try {
                headerParser.parseScanAndHeader(epsi.xmlsrCur());
            } catch (Exception e) {
                failure = e;
            }
            // Still ask for the header after a failure: the parser may hold a partial one.
            scanHeader = headerParser.getHeader();
        } else {
            MLScanAndHeaderParser headerParser = new MLScanAndHeaderParser();
            headerParser.setIsScan(false);
            try {
                headerParser.parseMLScanAndHeader(epsi.xmlsrCur());
            } catch (Exception e) {
                failure = e;
            }
            scanHeader = headerParser.getHeader();
        }

        if (failure != null) {
            // Parse errors used to be swallowed silently; make them visible.
            System.out.println("Scan parse exception:" + failure);
            failure.printStackTrace();
        }
        if (scanHeader == null) {
            // Returning null here would look like end-of-file to the caller.
            throw new IllegalStateException(
                    "No scan header could be parsed for scan index " + currentScanIndex
                            + " in " + fileName, failure);
        }

        long filePos = epsi.getFilePos();
        offsets.put(scanHeader.getNum(), filePos);
        scanHeader.setScanOffset(filePos);
        return scanHeader;
    }

    /**
     * @return the scan number to file offset map; in sequential mode it only
     *         covers the scans returned so far by {@link #nextHeader()}
     */
    public Map<Integer,Long> getOffsets() {
        return offsets;
    }
    /* end TAH */

    /** Closes the stream if it is non-null, printing (not propagating) any failure. */
    private void closeFile(FileInputStream fileIN) {
        if (fileIN != null) {
            try {
                fileIN.close();
            }
            catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Opens the data file and positions the stream at the given byte offset.
     *
     * @return the positioned stream, or null if the file could not be opened
     *         or positioned (the failure is printed and the stream is closed)
     */
    private FileInputStream openAtOffset(long offset) {
        FileInputStream in = null;
        try {
            in = new FileInputStream(fileName);
            skipFully(in, offset);
            return in;
        } catch (Exception e) {
            System.out.println("File exception:" + e);
            e.printStackTrace();
            closeFile(in);
            return null;
        }
    }

    /** Skips exactly {@code count} bytes; a single skip() call may skip fewer than requested. */
    private static void skipFully(InputStream in, long count) throws IOException {
        long remaining = count;
        while (remaining > 0) {
            long skipped = in.skip(remaining);
            if (skipped <= 0) {
                throw new EOFException("Could not skip to offset " + count);
            }
            remaining -= skipped;
        }
    }

    /** Runs the mzXML scan parser on the stream and always closes the stream afterwards. */
    private ScanAndHeaderParser parseXmlAndClose(FileInputStream in, boolean wantScan) {
        ScanAndHeaderParser parser = new ScanAndHeaderParser();
        parser.setIsScan(wantScan);
        parser.setFileInputStream(in);
        try {
            parser.parseScanAndHeader();
        } finally {
            closeFile(in);
        }
        return parser;
    }

    /** Runs the mzML scan parser on the stream and always closes the stream afterwards. */
    private MLScanAndHeaderParser parseMlAndClose(FileInputStream in, boolean wantScan) {
        MLScanAndHeaderParser parser = new MLScanAndHeaderParser();
        parser.setIsScan(wantScan);
        parser.setFileInputStream(in);
        try {
            parser.parseMLScanAndHeader();
        } finally {
            closeFile(in);
        }
        return parser;
    }

    /**
     * Read a particular scan from a MSXML file and return a generic Scan object
     * with its data. Note: scanNumbers are 1-based.
     *
     * @param scanNumber 1-based scan number
     * @return a scan object carrying the scan header information plus the
     *         peaks information that a scanHeader object does not include, or
     *         null if the scan number has no known offset or the file cannot
     *         be opened and positioned
     */
    public Scan rap(int scanNumber)
    {
        // Look the offset up first so no stream is opened for unknown scans.
        long scanOffset = getScanOffset(scanNumber);
        if (scanOffset == -1) {
            return null;
        }
        FileInputStream fileIN = openAtOffset(scanOffset);
        if (fileIN == null) {
            return null;
        }
        if (isXML) {
            return parseXmlAndClose(fileIN, true).getScan();
        }
        return parseMlAndClose(fileIN, true).getScan();
    }

    /**
     * Get the total number of scans in the mzXMLfile handled by this parser.
     *
     * @return The number of scans.
     */
    public int getScanCount() /* TAH Nov 2009 */
    {
        if (epsi != null) { //sequential scan, index scan hasn't been run
            return maxScan;
        }
        return offsets == null ? 0 : offsets.size();
    } /* end TAH */

    public int getMaxScanNumber()
    {
        return maxScan;
    }

    /**
     * get scan offset, scan number is 1 based.
     *
     * @param scanNumber 1-based scan number
     * @return the recorded file offset of the scan, or -1 if the scan number
     *         is not positive or has no recorded offset
     */
    public long getScanOffset(int scanNumber)
    {
        if (scanNumber <= 0 || offsets == null) {
            return -1;
        }
        // Single lookup instead of containsKey() followed by get().
        Long offset = offsets.get(scanNumber);
        return offset == null ? -1 : offset.longValue();
    }
}